from bs4 import BeautifulSoup
import bs4
import pandas as pd

# Path of the saved HTML page to analyse.
file = "../output/41"

with open(file, 'rt', encoding='utf-8') as f:
    html_text = f.read()

dom = BeautifulSoup(html_text, 'lxml')

# A document without a <title> element yields dom.title is None; print an
# empty title instead of failing with AttributeError.
title_text = dom.title.text if dom.title is not None else ''
print('title: %s' % title_text)

# Clean up: detach every <script> and <style> element from the tree.
for tag in dom.find_all(['script', 'style']):
    tag.extract()

# Leaf nodes collected by explore_child(), in document order.
root_nodes = []


def explore_child(node):
    """Recursively collect the leaf nodes below *node* into ``root_nodes``.

    A leaf is either a non-Tag node (for example a text string) or a Tag
    that has no children. Bare ``'\\n'`` strings and HTML comments are
    skipped. ``None`` (e.g. a document without a <body>) is ignored.

    Args:
        node: A BeautifulSoup node (Tag or string-like), or None.

    Returns:
        None. Results are appended to the module-level ``root_nodes`` list.
    """
    if node is None:
        return
    # Logical `and` short-circuits, so the comparison only runs for strings.
    if isinstance(node, str) and node == '\n':
        return
    if isinstance(node, bs4.element.Comment):
        return
    # Non-Tag nodes and childless Tags are both leaves.
    if not isinstance(node, bs4.element.Tag) or not node.contents:
        root_nodes.append(node)
        return
    for child in node.children:
        explore_child(child)

# Walk the document body and gather its leaf nodes into root_nodes.
explore_child(dom.body)

print('root nodes count: %d' % len(root_nodes))

# Extract the text of every leaf node. NavigableString is a str subclass,
# so a single str check covers both; str() converts it to a plain string
# so the result does not keep a reference to the parse tree.
root_node_text = []
for n in root_nodes:
    if isinstance(n, bs4.element.Tag):
        root_node_text.append(n.text)
    elif isinstance(n, str):
        root_node_text.append(str(n))

# Naming the column in the constructor also works when no leaves were
# found (assigning .columns afterwards fails on an empty frame).
data = pd.DataFrame(root_node_text, columns=['text'])
# Length of each text after stripping surrounding whitespace.
data['len'] = [len(t.strip()) for t in data['text']]
